set.seed(42)
library(caret) # highly correlated features removal
## Loading required package: ggplot2
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.1
## ✔ dials 1.3.0 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.2
## ✔ recipes 1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ yardstick::precision() masks caret::precision()
## ✖ yardstick::recall() masks caret::recall()
## ✖ yardstick::sensitivity() masks caret::sensitivity()
## ✖ yardstick::spec() masks readr::spec()
## ✖ yardstick::specificity() masks caret::specificity()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(e1071)
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:tune':
##
## tune
##
## The following object is masked from 'package:rsample':
##
## permutations
##
## The following object is masked from 'package:parsnip':
##
## tune
Helpers
# Fit a C-classification SVM on the given feature columns and, when a
# testing set is supplied, evaluate it on that set.
#
# Args:
#   training_set: data frame with the feature columns and a `class` column.
#   testing_set:  data frame to predict on, or NULL to skip evaluation.
#   columns:      names/indices of the feature columns to train on.
#   kernel, gamma, cost: forwarded to e1071::svm().
#     NOTE(review): the default gamma is based on ncol(training_set) (all
#     columns, metadata included), not on length(columns) actually used
#     for training — confirm this is intended.
#
# Returns a list with the fitted `model`, and — when testing_set is given —
# `prediction_set` (testing set plus pred/prob_good/prob_bad columns) and
# `cm` (caret confusion matrix, positive class "good").
train_svm <- function(
    training_set,
    testing_set,
    columns,
    kernel = "radial",
    gamma = if (is.vector(training_set)) 1 else 1 / ncol(training_set),
    cost = 1) {
  fitted <- svm(
    training_set[columns],
    training_set$class,
    kernel = kernel, type = "C-classification",
    gamma = gamma,
    cost = cost,
    probability = TRUE,
    cross = 10
  )
  if (is.null(testing_set)) {
    return(list(model = fitted))
  }
  preds <- predict(fitted, testing_set[columns], probability = TRUE)
  probs <- attr(preds, "probabilities")
  evaluated <- testing_set %>%
    mutate(
      pred = preds,
      prob_good = probs[, "good"],
      prob_bad = probs[, "bad"]
    )
  list(
    model = fitted,
    prediction_set = evaluated,
    cm = confusionMatrix(
      evaluated$pred, evaluated$class,
      mode = "everything",
      positive = "good"
    )
  )
}
# Fit a binomial logistic regression on the given feature columns and
# evaluate it on the testing set.
#
# Args:
#   training_set: data frame with the feature columns and a `class` column.
#   testing_set:  data frame to predict on.
#   columns:      names/indices of the feature columns to use as predictors.
#
# Returns a list with the fitted `model`, `prediction_set` (testing set
# plus prob_good/prob_bad/pred columns; documents with P(good) > 0.5 are
# predicted "good") and `cm` (caret confusion matrix, positive "good").
train_glm <- function(training_set, testing_set, columns) {
  model_formula <- reformulate(colnames(training_set[columns]), "class")
  fitted <- glm(
    model_formula,
    training_set,
    family = "binomial"
  )
  # type = "response" yields P(class == "good") on the probability scale.
  response <- predict(fitted, testing_set[columns], type = "response")
  evaluated <- testing_set %>%
    mutate(
      prob_good = response,
      prob_bad = 1 - response,
      pred = factor(
        if_else(response > .5, "good", "bad"),
        levels = c("bad", "good")
      )
    )
  list(
    model = fitted,
    prediction_set = evaluated,
    cm = confusionMatrix(
      evaluated$pred, evaluated$class,
      mode = "everything",
      positive = "good"
    )
  )
}
# Visualize and summarize where a model's test-set predictions go wrong.
#
# Prints a jittered probability-by-class plot, per-subcorpus confusion
# tables, and the misclassified documents ordered by how far their
# probability estimate sits from the 0.5 decision boundary.
#
# Returns a list with `deviations` (all misclassified rows, most deviant
# first), `highest_deviations` (FileNames with abs_dev >= 0.17, most
# deviant first) and `plot` (the ggplot object).
get_mismatch_details <- function(data_with_predictions) {
  prob_plot <- data_with_predictions %>%
    ggplot(aes(x = prob_good, y = class, color = subcorpus)) +
    geom_jitter(height = 0.2, width = 0)
  print(prob_plot)
  cat("Confusion matrices by subcorpora:\n")
  print(table(select(data_with_predictions, pred, class, subcorpus)))
  cat("\n")
  # Misclassified rows, ranked by distance from the decision boundary.
  misclassified <- data_with_predictions %>%
    filter(pred != class) %>%
    mutate(abs_dev = abs(prob_good - 0.5)) %>%
    arrange(desc(abs_dev))
  cat("Greatest deviations:\n")
  misclassified %>%
    select(abs_dev, prob_good, class, subcorpus, FileName) %>%
    mutate(across(c(prob_good, abs_dev), ~ round(.x, 3))) %>%
    print(n = round(nrow(data_with_predictions) / 5))
  cat("Names of highest-deviating documents:\n")
  top_names <- misclassified %>%
    filter(abs_dev >= 0.17) %>%
    arrange(desc(abs_dev)) %>%
    pull(FileName)
  print(top_names)
  list(
    deviations = misclassified,
    highest_deviations = top_names,
    plot = prob_plot
  )
}
# Explain why a single document is misclassified (an "outlier").
#
# For the 16 most important features, scores the document's value against
# the interquartile ranges of the "good" and "bad" classes (-2 = very bad
# ... +2 = very good), prints which features contradict vs. support the
# document's annotated class, and returns a faceted boxplot comparing the
# document (red dot) against both class distributions.
#
# Args:
#   doc_name:             FileName of the document to analyze.
#   variable_importances: named numeric vector of feature importances.
#   dataset:              unscaled data frame with class/FileName/KUK_ID
#                         and the feature columns.
#
# Fixes over the original: the document's rows/class and the per-class
# column partitions are computed once instead of on every loop iteration
# (and `docclass` no longer depends on a leaked loop variable), and the
# score table is built in one go instead of grown row by row.
analyze_outlier <- function(doc_name, variable_importances, dataset) {
  important_variables <- sort(variable_importances, decreasing = TRUE) %>%
    head(n = 16)
  varnames <- names(important_variables)
  # Loop invariants, hoisted.
  doc_rows <- filter(dataset, FileName == doc_name)
  docclass <- doc_rows$class
  good_rows <- filter(dataset, class == "good")
  bad_rows <- filter(dataset, class == "bad")
  # Score one feature: -2 very bad, -1 bad, 0 medium, +1 good, +2 very good.
  score_feature <- function(v) {
    vgood <- good_rows[[v]]
    vbad <- bad_rows[[v]]
    vdoc <- doc_rows[[v]]
    # flip signs so that good values are always greater
    if (mean(vgood) < mean(vbad)) {
      vbad <- -vbad
      vgood <- -vgood
      vdoc <- -vdoc
    }
    qgood <- quantile(vgood, probs = c(.25, .75))
    qbad <- quantile(vbad, probs = c(.25, .75))
    sum(c(
      vdoc > qbad[[1]], vdoc > qbad[[2]], vdoc > qgood[[1]], vdoc > qgood[[2]]
    )) - 2
  }
  varscores <- tibble(
    feat = varnames,
    score = vapply(varnames, score_feature, numeric(1))
  ) %>%
    mutate(verbose_score = case_when(
      score == -2 ~ "very bad",
      score == -1 ~ "bad",
      score == 1 ~ "good",
      score == 2 ~ "very good",
      .default = "medium"
    )) %>%
    rowid_to_column("rank") %>%
    select(rank, everything())
  cat(paste("class", docclass, "and:\n"))
  # Features whose score contradicts the document's annotated class ...
  contradicting <- if (docclass == "good") {
    varscores %>% filter(score < 0)
  } else {
    varscores %>% filter(score > 0)
  }
  print(
    contradicting %>%
      select(rank, feat, verbose_score) %>%
      as.data.frame()
  )
  cat("even though:\n")
  # ... versus those consistent with it.
  consistent <- if (docclass == "good") {
    varscores %>% filter(score >= 0)
  } else {
    varscores %>% filter(score <= 0)
  }
  print(
    consistent %>%
      select(rank, feat, verbose_score) %>%
      as.data.frame()
  )
  # Long-format, z-scaled view of the important features for plotting;
  # scale() returns a one-column matrix, hence the ~ .x[, 1] unwrap.
  dmut <- dataset %>%
    select(KUK_ID, FileName, class, all_of(varnames)) %>%
    mutate(across(all_of(varnames), ~ scale(.x))) %>%
    pivot_longer(
      all_of(varnames),
      names_to = "feature", values_to = "value"
    ) %>%
    mutate(across(value, ~ .x[, 1])) %>%
    mutate(across(feature, ~ factor(.x, levels = varnames)))
  # Extreme values (> 5 SDs) are dropped from the boxplots, but the
  # analyzed document itself is always drawn (red point, from unfiltered
  # dmut).
  cat(
    nrow(dmut %>% filter(value > 5)),
    "observation(s) removed from the plot\n"
  )
  dmutf <- dmut %>% filter(value <= 5)
  plot <- dmutf %>%
    ggplot(aes(x = class, y = value)) +
    facet_wrap(~feature) +
    geom_boxplot() +
    geom_point(
      data = dmut %>% filter(FileName == doc_name), color = "red", size = 5
    ) +
    labs(y = "measurements (scaled)")
  return(plot)
}
Load and tidy data
# Mapping from raw measurement column names to human-readable feature
# names (columns: name_orig, name_pretty).
pretty_names <- read_csv("../feat_name_mapping.csv")
## Rows: 85 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name_orig, name_pretty
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Translate one raw feature name to its pretty form via the global
# `pretty_names` table; names with no (unique) mapping entry are returned
# unchanged.
prettify_feat_name <- function(x) {
  pretty <- pretty_names %>%
    filter(name_orig == x) %>%
    pull(name_pretty)
  if (length(pretty) == 1) {
    pretty
  } else {
    x
  }
}
# Vectorized wrapper around prettify_feat_name(): prettify every name in
# a character vector.
prettify_feat_name_vector <- function(x) {
  unlist(map(x, prettify_feat_name))
}
# Raw measurements: one row per document, metadata columns followed by
# 85 numeric feature columns.
data <- read_csv("../measurements/measurements.csv")
## Rows: 753 Columns: 108
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (20): fpath, KUK_ID, FileName, FileFormat, FolderPath, subcorpus, Source...
## dbl (85): RuleAbstractNouns, RuleAmbiguousRegards, RuleAnaphoricReferences, ...
## lgl (3): ClarityPursuit, SyllogismBased, Bindingness
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Index of the first measurement (non-metadata) column after the column
# drops below — NOTE(review): keep in sync with the select() calls that
# follow; confirm 18 is still correct if columns change.
.firstnonmetacolumn <- 18
# Drop unused metadata columns and clean up missing values:
#   * -1 sentinels in the variation-coefficient (.v) columns become NA,
#   * NA rule counters become 0,
#   * remaining NAs in distance/fraction features become column medians,
#   * the seven garden-path (GP) rules are summed into one GPs column.
data_no_nas <- data %>%
  select(!c(
    fpath,
    # KUK_ID,
    # FileName,
    FolderPath,
    # subcorpus,
    DocumentTitle,
    ClarityPursuit,
    # Readability,
    SyllogismBased,
    SourceDB
  )) %>%
  # replace -1s in variation coefficients with NAs
  mutate(across(c(
    `RuleDoubleAdpos.max_allowable_distance.v`,
    `RuleTooManyNegations.max_negation_frac.v`,
    `RuleTooManyNegations.max_allowable_negations.v`,
    `RuleTooManyNominalConstructions.max_noun_frac.v`,
    `RuleTooManyNominalConstructions.max_allowable_nouns.v`,
    `RuleCaseRepetition.max_repetition_count.v`,
    `RuleCaseRepetition.max_repetition_frac.v`,
    `RulePredSubjDistance.max_distance.v`,
    `RulePredObjDistance.max_distance.v`,
    `RuleInfVerbDistance.max_distance.v`,
    `RuleMultiPartVerbs.max_distance.v`,
    `RuleLongSentences.max_length.v`,
    `RulePredAtClauseBeginning.max_order.v`,
    `mattr.v`,
    `maentropy.v`
  ), ~ na_if(.x, -1))) %>%
  # replace NAs with 0s
  replace_na(list(
    RuleGPcoordovs = 0,
    RuleGPdeverbaddr = 0,
    RuleGPpatinstr = 0,
    RuleGPdeverbsubj = 0,
    RuleGPadjective = 0,
    RuleGPpatbenperson = 0,
    RuleGPwordorder = 0,
    RuleDoubleAdpos = 0,
    RuleDoubleAdpos.max_allowable_distance.v = 0,
    RuleAmbiguousRegards = 0,
    RuleReflexivePassWithAnimSubj = 0,
    RuleTooManyNegations = 0,
    RuleTooManyNegations.max_negation_frac.v = 0,
    RuleTooManyNegations.max_allowable_negations.v = 0,
    RuleTooManyNominalConstructions.max_noun_frac.v = 0,
    RuleTooManyNominalConstructions.max_allowable_nouns.v = 0,
    RuleFunctionWordRepetition = 0,
    RuleCaseRepetition.max_repetition_count.v = 0,
    RuleCaseRepetition.max_repetition_frac.v = 0,
    RuleWeakMeaningWords = 0,
    RuleAbstractNouns = 0,
    RuleRelativisticExpressions = 0,
    RuleConfirmationExpressions = 0,
    RuleRedundantExpressions = 0,
    RuleTooLongExpressions = 0,
    RuleAnaphoricReferences = 0,
    RuleLiteraryStyle = 0,
    RulePassive = 0,
    RulePredSubjDistance = 0,
    RulePredSubjDistance.max_distance.v = 0,
    RulePredObjDistance = 0,
    RulePredObjDistance.max_distance.v = 0,
    RuleInfVerbDistance = 0,
    RuleInfVerbDistance.max_distance.v = 0,
    RuleMultiPartVerbs = 0,
    RuleMultiPartVerbs.max_distance.v = 0,
    RuleLongSentences.max_length.v = 0,
    RulePredAtClauseBeginning.max_order.v = 0,
    RuleVerbalNouns = 0,
    RuleDoubleComparison = 0,
    RuleWrongValencyCase = 0,
    RuleWrongVerbonominalCase = 0,
    RuleIncompleteConjunction = 0
  )) %>%
  # replace NAs with medians
  mutate(across(c(
    RuleDoubleAdpos.max_allowable_distance,
    RuleTooManyNegations.max_negation_frac,
    RuleTooManyNegations.max_allowable_negations,
    RulePredSubjDistance.max_distance,
    RulePredObjDistance.max_distance,
    RuleInfVerbDistance.max_distance,
    RuleMultiPartVerbs.max_distance
  ), ~ coalesce(., median(., na.rm = TRUE)))) %>%
  # merge GPs
  mutate(
    GPs = RuleGPcoordovs +
      RuleGPdeverbaddr +
      RuleGPpatinstr +
      RuleGPdeverbsubj +
      RuleGPadjective +
      RuleGPpatbenperson +
      RuleGPwordorder
  ) %>%
  select(!c(
    RuleGPcoordovs,
    RuleGPdeverbaddr,
    RuleGPpatinstr,
    RuleGPdeverbsubj,
    RuleGPadjective,
    RuleGPpatbenperson,
    RuleGPwordorder
  ))
# Normalize, prune and type the cleaned data:
#   * count features expected to grow with text length are divided by
#     word_count or sent_count,
#   * length-dependent / unreliable / artificially limited / NA-heavy
#     variables are dropped,
#   * categorical metadata columns are converted to factors.
data_clean <- data_no_nas %>%
  # norm data expected to correlate with text length
  mutate(across(c(
    GPs,
    RuleDoubleAdpos,
    RuleAmbiguousRegards,
    RuleFunctionWordRepetition,
    RuleWeakMeaningWords,
    RuleAbstractNouns,
    RuleRelativisticExpressions,
    RuleConfirmationExpressions,
    RuleRedundantExpressions,
    RuleTooLongExpressions,
    RuleAnaphoricReferences,
    RuleLiteraryStyle,
    RulePassive,
    RuleVerbalNouns,
    RuleDoubleComparison,
    RuleWrongValencyCase,
    RuleWrongVerbonominalCase,
    RuleIncompleteConjunction,
    num_hapax,
    RuleReflexivePassWithAnimSubj,
    RuleTooManyNominalConstructions,
    RulePredSubjDistance,
    RuleMultiPartVerbs,
    RulePredAtClauseBeginning
  ), ~ .x / word_count)) %>%
  mutate(across(c(
    RuleTooFewVerbs,
    RuleTooManyNegations,
    RuleCaseRepetition,
    RuleLongSentences,
    RulePredObjDistance,
    RuleInfVerbDistance
  ), ~ .x / sent_count)) %>%
  # remove variables identified as text-length dependent
  select(!c(
    RuleTooFewVerbs,
    RuleTooManyNegations,
    RuleTooManyNominalConstructions,
    RuleCaseRepetition,
    RuleLongSentences,
    RulePredAtClauseBeginning,
    syllab_count,
    char_count
  )) %>%
  # remove variables identified as unreliable
  select(!c(
    RuleAmbiguousRegards,
    RuleFunctionWordRepetition,
    RuleDoubleComparison,
    RuleWrongValencyCase,
    RuleWrongVerbonominalCase
  )) %>%
  # remove further variables belonging to the 'acceptability' category
  select(!c(RuleIncompleteConjunction)) %>%
  # remove artificially limited variables
  select(!c(
    RuleCaseRepetition.max_repetition_frac,
    RuleCaseRepetition.max_repetition_frac.v
  )) %>%
  # remove variables with too many NAs
  select(!c(
    RuleDoubleAdpos.max_allowable_distance,
    RuleDoubleAdpos.max_allowable_distance.v
  )) %>%
  mutate(across(c(
    class,
    FileFormat,
    subcorpus,
    DocumentVersion,
    LegalActType,
    Objectivity,
    AuthorType,
    RecipientType,
    RecipientIndividuation,
    Anonymized
  ), ~ as.factor(.x)))
# no NAs should be present now — sanity check: this should print an
# empty (0-row) tibble.
data_clean[!complete.cases(data_clean[.firstnonmetacolumn:ncol(data_clean)]), ]
## # A tibble: 0 × 78
## # ℹ 78 variables: KUK_ID <chr>, FileName <chr>, FileFormat <fct>,
## # subcorpus <fct>, SourceID <chr>, Readability <chr>, DocumentVersion <fct>,
## # ParentDocumentID <chr>, LegalActType <fct>, Objectivity <fct>,
## # Bindingness <lgl>, AuthorType <fct>, RecipientType <fct>,
## # RecipientIndividuation <fct>, Anonymized <fct>, Recipient Type <chr>,
## # class <fct>, RuleAbstractNouns <dbl>, RuleAnaphoricReferences <dbl>,
## # RuleCaseRepetition.max_repetition_count <dbl>, …
# Swap in human-readable feature names for all columns.
colnames(data_clean) <- prettify_feat_name_vector(colnames(data_clean))
# z-score every measurement column; scale() returns a one-column matrix,
# so [, 1] unwraps it back to a plain numeric vector.
data_scaled <- data_clean %>%
  mutate(across(all_of(.firstnonmetacolumn:ncol(data_clean)), ~ scale(.x)[, 1]))
# Combined class+subcorpus key used to stratify the train/test split.
data_stratified <- data_scaled %>%
  unite("strata", c("class", "subcorpus"), remove = FALSE)
Important features identification
# Pre-computed per-feature importance and selection measures.
feature_importances <- read_csv("../importance_measures/featcomp.csv")
## Rows: 61 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Variable, Sign
## dbl (15): Importance, p_value, estimate, wilcox_p, wilcox_r, kw_p, kw_chi2, ...
## lgl (4): selected_pval, wilcox_sel, kw_sel, selected_reg
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Features that passed the Kruskal-Wallis selection (kw_sel), with their
# kw_epsilon2 effect size as the importance score (named numeric vector).
selected_features_names <- feature_importances %>%
  filter(kw_sel) %>%
  pull(Variable)
variable_importances <- feature_importances %>%
  filter(kw_sel) %>%
  pull(kw_epsilon2)
names(variable_importances) <- selected_features_names
Hyperparameters
# Candidate feature sets to tune over, with short ids for the results table.
# NOTE(review): columns_all, columns_cleaned and columns_readabilty_forms
# are not defined anywhere in this file — presumably they come from an
# earlier chunk; confirm (and note the "readabilty" spelling).
colsids <- c("all", "cleaned", "readforms")
colsets <- list(columns_all, columns_cleaned, columns_readabilty_forms)
Splits and folds
# 3:1 train/test split, stratified on the combined class_subcorpus key so
# both sets keep a similar composition.
.splitprop <- 3 / 4
split <- initial_split(data_stratified, .splitprop, strata = strata)
training_set <- training(split)
testing_set <- testing(split)
# Class balance in the training set.
training_set %>%
  select(class) %>%
  table()
## class
## bad good
## 310 253
# Class balance in the testing set.
testing_set %>%
  select(class) %>%
  table()
## class
## bad good
## 104 86
# Training-set composition by subcorpus and class.
training_set %>%
  select(subcorpus, class) %>%
  table()
## class
## subcorpus bad good
## CzCDC 157 0
## FrBo 56 171
## KUKY 64 82
## LiFRLaw 3 0
## OmbuFlyers 30 0
# Testing-set composition by subcorpus and class.
testing_set %>%
  select(subcorpus, class) %>%
  table()
## class
## subcorpus bad good
## CzCDC 54 0
## FrBo 22 58
## KUKY 20 28
## LiFRLaw 0 0
## OmbuFlyers 8 0
Tune
# Empty accumulator for cross-validated tuning results across kernels and
# feature sets (filled by the commented-out tuning runs below, or loaded
# from the cached CSV).
tune_res <- tibble(
  columns = character(),
  kernel = character(),
  gamma = numeric(),
  cost = numeric(),
  error = numeric(),
  dispersion = numeric()
)
# commented out to reduce computation time
# for (coli in seq_along(colsets)) {
# colsid <- colsids[coli]
# columns <- colsets[[coli]]
# message("tune linear on ", colsid)
# tune_linear <- tune.svm(training_set[columns], training_set$class,
# cost = 10^(-3:1),
# kernel = "linear"
# )
# tune_res <- tune_res %>%
# bind_rows(tune_linear$performances %>%
# mutate(kernel = "linear", columns = colsid, gamma = 0))
# message("tune radial on ", colsid)
# tune_radial <- tune.svm(training_set[columns], training_set$class,
# gamma = 10^(-3:3),
# cost = c(0.01, 0.1, 1, 10, 100, 1000),
# kernel = "radial"
# )
# tune_res <- tune_res %>%
# bind_rows(tune_radial$performances %>%
# mutate(kernel = "radial", columns = colsid))
# message("tune polynomial3 on ", colsid)
# tune_polynomial <- tune.svm(training_set[columns], training_set$class,
# gamma = 10^(-3:0),
# degree = 3,
# cost = 10^(-3:1),
# kernel = "polynomial"
# )
# tune_res <- tune_res %>%
# bind_rows(tune_polynomial$performances %>%
# mutate(kernel = "polynomial3", columns = colsid))
# message("tune polynomial4 on ", colsid)
# tune_polynomial <- tune.svm(training_set[columns], training_set$class,
# gamma = 10^(-3:-1),
# degree = 4,
# cost = 10^(-3:1),
# kernel = "polynomial"
# )
# tune_res <- tune_res %>%
# bind_rows(tune_polynomial$performances %>%
# mutate(kernel = "polynomial4", columns = colsid))
# message("tune polynomial5 on ", colsid)
# tune_polynomial <- tune.svm(training_set[columns], training_set$class,
# gamma = 10^(-3:-1),
# degree = 5,
# cost = 10^(-3:0),
# kernel = "polynomial"
# )
# tune_res <- tune_res %>%
# bind_rows(tune_polynomial$performances %>%
# mutate(kernel = "polynomial5", columns = colsid))
# message("tune sigmoid on ", colsid)
# tune_sigmoid <- tune.svm(training_set[columns], training_set$class,
# gamma = 10^(-3:3),
# cost = 10^(-3:3),
# kernel = "sigmoid"
# )
# tune_res <- tune_res %>%
# bind_rows(tune_sigmoid$performances %>%
# mutate(kernel = "sigmoid", columns = colsid))
# }
# tune_res %>% write_csv("tune_results.csv")
# Tuning-grid results cached from the commented-out runs above.
tune_res <- read_csv("tune_results.csv")
## Rows: 429 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): columns, kernel
## dbl (5): gamma, cost, error, dispersion, degree
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Best configurations by raw CV error (ties broken by higher dispersion).
tune_res %>%
  arrange(error, -dispersion)
## # A tibble: 429 × 7
## columns kernel gamma cost error dispersion degree
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 cleaned radial 0.01 1 0.201 0.0425 NA
## 2 cleaned radial 0.001 100 0.202 0.0444 NA
## 3 all radial 0.001 100 0.206 0.0576 NA
## 4 cleaned radial 0.001 10 0.206 0.0421 NA
## 5 all radial 0.01 1 0.208 0.0440 NA
## 6 all sigmoid 0.001 10 0.210 0.0530 NA
## 7 all radial 0.001 10 0.216 0.0679 NA
## 8 cleaned linear 0 0.1 0.217 0.0285 NA
## 9 all sigmoid 0.001 100 0.222 0.0759 NA
## 10 cleaned linear 0 0.01 0.222 0.0383 NA
## # ℹ 419 more rows
# Best configurations when penalizing unstable (high-dispersion) results.
tune_res %>%
  arrange(error + dispersion)
## # A tibble: 429 × 7
## columns kernel gamma cost error dispersion degree
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 cleaned radial 0.01 1 0.201 0.0425 NA
## 2 cleaned linear 0 0.1 0.217 0.0285 NA
## 3 cleaned radial 0.001 100 0.202 0.0444 NA
## 4 cleaned radial 0.001 10 0.206 0.0421 NA
## 5 all radial 0.01 1 0.208 0.0440 NA
## 6 cleaned radial 0.01 10 0.240 0.0199 NA
## 7 cleaned linear 0 0.01 0.222 0.0383 NA
## 8 cleaned sigmoid 0.001 10 0.222 0.0389 NA
## 9 all sigmoid 0.001 10 0.210 0.0530 NA
## 10 all radial 0.001 100 0.206 0.0576 NA
## # ℹ 419 more rows
# Best configurations on the full ("all") feature set only.
tune_res %>%
  filter(columns == "all") %>%
  arrange(error, -dispersion)
## # A tibble: 143 × 7
## columns kernel gamma cost error dispersion degree
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 all radial 0.001 100 0.206 0.0576 NA
## 2 all radial 0.01 1 0.208 0.0440 NA
## 3 all sigmoid 0.001 10 0.210 0.0530 NA
## 4 all radial 0.001 10 0.216 0.0679 NA
## 5 all sigmoid 0.001 100 0.222 0.0759 NA
## 6 all sigmoid 0.01 1 0.222 0.0690 NA
## 7 all linear 0 1 0.224 0.0712 NA
## 8 all radial 0.001 1 0.226 0.0458 NA
## 9 all linear 0 0.1 0.226 0.0652 NA
## 10 all linear 0 0.01 0.226 0.0662 NA
## # ℹ 133 more rows
# No runs were recorded for any "notcorrelating" feature set (empty result).
tune_res %>%
  filter(str_detect(columns, "notcorrelating.*")) %>%
  arrange(error, -dispersion)
## # A tibble: 0 × 7
## # ℹ 7 variables: columns <chr>, kernel <chr>, gamma <dbl>, cost <dbl>,
## # error <dbl>, dispersion <dbl>, degree <dbl>
# CV error (ribbon: ± dispersion) across the cost grid, faceted by kernel
# and feature set, colored by gamma.
tune_res %>%
  mutate(across(gamma, as.factor)) %>%
  ggplot(aes(
    x = cost, y = error, ymin = error - dispersion,
    ymax = error + dispersion, color = gamma, fill = gamma
  )) +
  geom_point() +
  geom_line() +
  geom_ribbon(alpha = 0.1) +
  scale_x_log10() +
  facet_grid(kernel ~ columns) +
  theme(legend.position = "bottom")

SVM cleaned
# Final SVM on the "cleaned" feature set with the best radial-kernel
# hyperparameters found by tuning (gamma = 0.01, cost = 1).
set.seed(42)
model_cleaned <- train_svm(
  training_set, testing_set, columns_cleaned, "radial",
  gamma = 0.01, cost = 1
)
model_cleaned$cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction bad good
## bad 83 21
## good 21 65
##
## Accuracy : 0.7789
## 95% CI : (0.7132, 0.8358)
## No Information Rate : 0.5474
## P-Value [Acc > NIR] : 2.6e-11
##
## Kappa : 0.5539
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.7558
## Specificity : 0.7981
## Pos Pred Value : 0.7558
## Neg Pred Value : 0.7981
## Precision : 0.7558
## Recall : 0.7558
## F1 : 0.7558
## Prevalence : 0.4526
## Detection Rate : 0.3421
## Detection Prevalence : 0.4526
## Balanced Accuracy : 0.7769
##
## 'Positive' Class : good
##
# Per-subcorpus breakdown of the test-set predictions and the documents
# the model gets most wrong.
mismatches_cleaned <- get_mismatch_details(model_cleaned$prediction_set)

## Confusion matrices by subcorpora:
## , , subcorpus = CzCDC
##
## class
## pred bad good
## bad 54 0
## good 0 0
##
## , , subcorpus = FrBo
##
## class
## pred bad good
## bad 7 7
## good 15 51
##
## , , subcorpus = KUKY
##
## class
## pred bad good
## bad 16 14
## good 4 14
##
## , , subcorpus = LiFRLaw
##
## class
## pred bad good
## bad 0 0
## good 0 0
##
## , , subcorpus = OmbuFlyers
##
## class
## pred bad good
## bad 6 0
## good 2 0
##
##
## Greatest deviations:
## # A tibble: 42 × 5
## abs_dev prob_good class subcorpus FileName
## <dbl> <dbl> <fct> <fct> <chr>
## 1 0.466 0.966 bad FrBo orig_Jaká pravidla platí pro veřejné zaká…
## 2 0.426 0.926 bad FrBo orig_Kompletní průvodce pořizováním nahrá…
## 3 0.385 0.885 bad FrBo orig_Jak řešit lavinovitou černou skládku…
## 4 0.379 0.121 good KUKY AK_JH_Hroch_ustavni_stiznost
## 5 0.376 0.876 bad KUKY PR_Konecny__Miliak
## 6 0.347 0.847 bad KUKY PR_Masinova
## 7 0.335 0.165 good FrBo red_Certifikáty autorizovaných inspektorů
## 8 0.33 0.83 bad FrBo orig_Mohou spolky ve správních žalobách p…
## 9 0.322 0.822 bad FrBo orig_Jaké trestné činy mohou souviset s k…
## 10 0.321 0.821 bad FrBo orig_Sousedské vztahy
## 11 0.318 0.182 good KUKY Mestsky_urad_PRIKAZ
## 12 0.294 0.794 bad FrBo 28
## 13 0.278 0.222 good KUKY Odvolani_proti_rozhodnuti_o_nepovoleni_ka…
## 14 0.275 0.775 bad FrBo orig_Jak se bránit neposkytnutí projektov…
## 15 0.266 0.766 bad FrBo orig_Jak se bránit obtěžování kouřem a pá…
## 16 0.237 0.263 good KUKY Mestsky_urad_usneseni_-_slouceni_pred
## 17 0.236 0.736 bad FrBo 153
## 18 0.225 0.275 good KUKY AK_JH_Podani_US_podpis
## 19 0.189 0.311 good KUKY Mestsky_urad_PRIKAZ_REV2
## 20 0.183 0.683 bad KUKY 043_Plisen-a-zavady-v-byte
## 21 0.16 0.34 good KUKY Mestsky_urad_kontrola_po
## 22 0.159 0.659 bad FrBo orig_Kompletní průvodce občana obtěžované…
## 23 0.153 0.653 bad FrBo 176
## 24 0.151 0.349 good KUKY 6417_2023_VOP
## 25 0.133 0.367 good FrBo red_Co je to úřední deska a jak ji využít
## 26 0.13 0.37 good KUKY Obecni urad_rozhodnuti zadost dle 106pdf
## 27 0.128 0.628 bad FrBo orig_Jak využít svého práva být informová…
## 28 0.122 0.378 good KUKY Mestsky_urad_kontrola_pred
## 29 0.111 0.611 bad FrBo 42
## 30 0.105 0.395 good FrBo 156
## 31 0.089 0.589 bad KUKY Dopis_studentské brigády
## 32 0.064 0.436 good KUKY Mestsky_urad__Vyzva_k_odstraneni_trabanta
## 33 0.06 0.44 good FrBo red_10 významných práv účastníka správníh…
## 34 0.055 0.555 bad OmbuFlyers Pozemkove-urady
## 35 0.048 0.452 good KUKY 6421_2023_VOP
## 36 0.047 0.453 good KUKY Mestsky_urad_Nesoucinnost-U_sroz
## 37 0.042 0.542 bad OmbuFlyers Skolstvi
## 38 0.039 0.539 bad FrBo orig_Předcházení ekologické újmě - jak se…
## # ℹ 4 more rows
## Names of highest-deviating documents:
## [1] "orig_Jaká pravidla platí pro veřejné zakázky malého rozsahu_final"
## [2] "orig_Kompletní průvodce pořizováním nahrávek veřejné správy"
## [3] "orig_Jak řešit lavinovitou černou skládku úprava svépomoc 2021"
## [4] "AK_JH_Hroch_ustavni_stiznost"
## [5] "PR_Konecny__Miliak"
## [6] "PR_Masinova"
## [7] "red_Certifikáty autorizovaných inspektorů"
## [8] "orig_Mohou spolky ve správních žalobách používat věcné argumenty_final"
## [9] "orig_Jaké trestné činy mohou souviset s korupcí"
## [10] "orig_Sousedské vztahy"
## [11] "Mestsky_urad_PRIKAZ"
## [12] "28"
## [13] "Odvolani_proti_rozhodnuti_o_nepovoleni_kaceni"
## [14] "orig_Jak se bránit neposkytnutí projektové dokumentace"
## [15] "orig_Jak se bránit obtěžování kouřem a pálením odpadu"
## [16] "Mestsky_urad_usneseni_-_slouceni_pred"
## [17] "153"
## [18] "AK_JH_Podani_US_podpis"
## [19] "Mestsky_urad_PRIKAZ_REV2"
## [20] "043_Plisen-a-zavady-v-byte"
# Publication version of the probability-by-class plot.
mismatches_cleaned$plot +
  theme_bw() +
  labs(y = "true class", x = "probability estimate")

ggsave("model_cleaned_probabilities.pdf")
## Saving 7 x 5 in image
# All misclassified documents by subcorpus and true class.
mismatches_cleaned$deviations %>%
  select(subcorpus, class) %>%
  table()
## class
## subcorpus bad good
## CzCDC 0 0
## FrBo 15 7
## KUKY 4 14
## LiFRLaw 0 0
## OmbuFlyers 2 0
# Misclassified documents by subcorpus and annotated readability level.
mismatches_cleaned$deviations %>%
  select(subcorpus, Readability) %>%
  table(useNA = "ifany")
## Readability
## subcorpus high low medium <NA>
## CzCDC 0 0 0 0
## FrBo 7 0 14 1
## KUKY 14 2 2 0
## LiFRLaw 0 0 0 0
## OmbuFlyers 0 0 0 2
# Strongly misclassified documents (probability at least 0.17 away from
# the 0.5 decision boundary) by subcorpus and true class.
mismatches_cleaned$deviations %>%
  filter(abs_dev >= 0.17) %>%
  select(subcorpus, class) %>%
  table()
## class
## subcorpus bad good
## CzCDC 0 0
## FrBo 10 1
## KUKY 3 6
## LiFRLaw 0 0
## OmbuFlyers 0 0
# Strongly misclassified documents (abs_dev >= 0.17) by subcorpus and
# annotated readability level.
mismatches_cleaned$deviations %>%
  filter(abs_dev >= 0.17) %>%
  select(subcorpus, Readability) %>%
  table(useNA = "ifany")
## Readability
## subcorpus high low medium <NA>
## CzCDC 0 0 0 0
## FrBo 1 0 9 1
## KUKY 6 1 2 0
## LiFRLaw 0 0 0 0
## OmbuFlyers 0 0 0 0
# Importances restricted to the features the cleaned model was trained on.
variable_importances_cleaned <- variable_importances[
  names(variable_importances) %in% columns_cleaned
]
# For every strongly misclassified document: print its metadata, explain
# which important features pull against / with its annotated class, and
# save a per-document feature boxplot to a KUK_ID-named PDF.
for (doc in mismatches_cleaned$highest_deviations) {
  doc_row <- mismatches_cleaned$deviations %>% filter(FileName == doc)
  cat(paste(doc, "/", doc_row["subcorpus"][[1]], "\n"))
  cat("KUK_ID:", doc_row["KUK_ID"][[1]], "\n")
  cat("dev:", doc_row["abs_dev"][[1]] %>% round(3), "\n")
  cat("Readability:", doc_row["Readability"][[1]], "\n")
  # Note: outlier analysis runs on the unscaled data_clean, not data_scaled.
  plt <- analyze_outlier(doc, variable_importances_cleaned, data_clean) +
    theme_bw()
  print(plt)
  ggsave(
    paste(
      c("outlier_cleaned_", doc_row["KUK_ID"][[1]], ".pdf"),
      collapse = ""
    ), plt,
    width = 8,
    height = 8
  )
}
## orig_Jaká pravidla platí pro veřejné zakázky malého rozsahu_final / FrBo
## KUK_ID: Fart_orig_05482
## dev: 0.466
## Readability: medium
## class bad and:
## rank feat verbose_score
## 1 1 activity good
## 2 2 verbdist good
## 3 3 sentlen.m good
## 4 4 ari very good
## 5 5 gf very good
## 6 6 smog very good
## 7 7 NOUNcount.m very good
## 8 8 VERBfrac.m good
## 9 9 fkgl very good
## 10 11 mamr good
## 11 12 passives very good
## 12 14 literary very good
## 13 15 predsubjdist.m very good
## even though:
## rank feat verbose_score
## 1 10 predorder.m medium
## 2 13 predorder.v bad
## 3 16 compoundVERBs medium
## 16 observation(s) removed from the plot

## orig_Kompletní průvodce pořizováním nahrávek veřejné správy / FrBo
## KUK_ID: Fart_orig_05648
## dev: 0.426
## Readability: medium
## class bad and:
## rank feat verbose_score
## 1 1 activity good
## 2 2 verbdist good
## 3 3 sentlen.m good
## 4 4 ari good
## 5 5 gf good
## 6 6 smog good
## 7 7 NOUNcount.m good
## 8 8 VERBfrac.m good
## 9 10 predorder.m good
## 10 11 mamr good
## 11 13 predorder.v good
## 12 14 literary good
## 13 15 predsubjdist.m good
## 14 16 compoundVERBs very good
## even though:
## rank feat verbose_score
## 1 9 fkgl medium
## 2 12 passives bad
## 16 observation(s) removed from the plot

## orig_Jak řešit lavinovitou černou skládku úprava svépomoc 2021 / FrBo
## KUK_ID: Fart_orig_06078
## dev: 0.385
## Readability: NA
## class bad and:
## rank feat verbose_score
## 1 1 activity very good
## 2 2 verbdist very good
## 3 3 sentlen.m good
## 4 4 ari very good
## 5 5 gf very good
## 6 6 smog very good
## 7 7 NOUNcount.m very good
## 8 8 VERBfrac.m very good
## 9 9 fkgl very good
## 10 10 predorder.m very good
## 11 11 mamr very good
## 12 12 passives very good
## 13 13 predorder.v good
## 14 15 predsubjdist.m good
## 15 16 compoundVERBs very good
## even though:
## rank feat verbose_score
## 1 14 literary medium
## 16 observation(s) removed from the plot

## AK_JH_Hroch_ustavni_stiznost / KUKY
## KUK_ID: 66f19554c6537d54ff062451
## dev: 0.379
## Readability: high
## class good and:
## rank feat verbose_score
## 1 1 activity bad
## 2 2 verbdist bad
## 3 3 sentlen.m bad
## 4 4 ari bad
## 5 5 gf bad
## 6 6 smog bad
## 7 8 VERBfrac.m bad
## 8 9 fkgl bad
## 9 10 predorder.m bad
## 10 11 mamr very bad
## 11 13 predorder.v bad
## 12 16 compoundVERBs bad
## even though:
## rank feat verbose_score
## 1 7 NOUNcount.m good
## 2 12 passives medium
## 3 14 literary medium
## 4 15 predsubjdist.m medium
## 16 observation(s) removed from the plot

## PR_Konecny__Miliak / KUKY
## KUK_ID: 66f19554c6537d54ff062454
## dev: 0.376
## Readability: medium
## class bad and:
## rank feat verbose_score
## 1 1 activity good
## 2 2 verbdist good
## 3 3 sentlen.m good
## 4 4 ari good
## 5 5 gf good
## 6 6 smog good
## 7 7 NOUNcount.m good
## 8 8 VERBfrac.m good
## 9 9 fkgl very good
## 10 10 predorder.m good
## 11 11 mamr good
## 12 13 predorder.v very good
## 13 14 literary good
## 14 15 predsubjdist.m very good
## 15 16 compoundVERBs very good
## even though:
## rank feat verbose_score
## 1 12 passives bad
## 16 observation(s) removed from the plot

## PR_Masinova / KUKY
## KUK_ID: 66f19554c6537d54ff06244c
## dev: 0.347
## Readability: medium
## class bad and:
## rank feat verbose_score
## 1 1 activity good
## 2 4 ari good
## 3 5 gf good
## 4 6 smog good
## 5 7 NOUNcount.m good
## 6 8 VERBfrac.m good
## 7 9 fkgl good
## 8 10 predorder.m good
## 9 11 mamr good
## 10 14 literary good
## 11 15 predsubjdist.m good
## 12 16 compoundVERBs very good
## even though:
## rank feat verbose_score
## 1 2 verbdist bad
## 2 3 sentlen.m bad
## 3 12 passives very bad
## 4 13 predorder.v medium
## 16 observation(s) removed from the plot

## red_Certifikáty autorizovaných inspektorů / FrBo
## KUK_ID: Fart_red_00253
## dev: 0.335
## Readability: high
## class good and:
## rank feat verbose_score
## 1 1 activity bad
## 2 2 verbdist bad
## 3 3 sentlen.m bad
## 4 4 ari bad
## 5 5 gf bad
## 6 6 smog bad
## 7 8 VERBfrac.m bad
## 8 9 fkgl bad
## 9 10 predorder.m bad
## 10 12 passives very bad
## 11 13 predorder.v very bad
## 12 14 literary very bad
## even though:
## rank feat verbose_score
## 1 7 NOUNcount.m medium
## 2 11 mamr very good
## 3 15 predsubjdist.m good
## 4 16 compoundVERBs good
## 16 observation(s) removed from the plot

## orig_Mohou spolky ve správních žalobách používat věcné argumenty_final / FrBo
## KUK_ID: Fart_orig_5938b
## dev: 0.33
## Readability: medium
## class bad and:
## rank feat verbose_score
## 1 1 activity good
## 2 2 verbdist good
## 3 3 sentlen.m good
## 4 5 gf good
## 5 6 smog good
## 6 7 NOUNcount.m very good
## 7 8 VERBfrac.m good
## 8 11 mamr good
## 9 15 predsubjdist.m good
## 10 16 compoundVERBs good
## even though:
## rank feat verbose_score
## 1 4 ari bad
## 2 9 fkgl bad
## 3 10 predorder.m bad
## 4 12 passives very bad
## 5 13 predorder.v bad
## 6 14 literary medium
## 16 observation(s) removed from the plot

## orig_Jaké trestné činy mohou souviset s korupcí / FrBo
## KUK_ID: Fart_orig_00285
## dev: 0.322
## Readability: medium
## class bad and:
## rank feat verbose_score
## 1 2 verbdist good
## 2 7 NOUNcount.m good
## 3 10 predorder.m good
## 4 13 predorder.v good
## 5 14 literary very good
## 6 16 compoundVERBs good
## even though:
## rank feat verbose_score
## 1 1 activity bad
## 2 3 sentlen.m bad
## 3 4 ari bad
## 4 5 gf bad
## 5 6 smog bad
## 6 8 VERBfrac.m medium
## 7 9 fkgl bad
## 8 11 mamr medium
## 9 12 passives medium
## 10 15 predsubjdist.m medium
## 16 observation(s) removed from the plot

## orig_Sousedské vztahy / FrBo
## KUK_ID: Fart_orig_OmVfN
## dev: 0.321
## Readability: medium
## class bad and:
## rank feat verbose_score
## 1 1 activity good
## 2 2 verbdist good
## 3 3 sentlen.m good
## 4 4 ari good
## 5 5 gf good
## 6 6 smog good
## 7 7 NOUNcount.m good
## 8 8 VERBfrac.m good
## 9 10 predorder.m good
## 10 11 mamr good
## 11 13 predorder.v good
## 12 15 predsubjdist.m good
## 13 16 compoundVERBs very good
## even though:
## rank feat verbose_score
## 1 9 fkgl bad
## 2 12 passives bad
## 3 14 literary medium
## 16 observation(s) removed from the plot

## Mestsky_urad_PRIKAZ / KUKY
## KUK_ID: 66f1be84c6537d54ff062490
## dev: 0.318
## Readability: high
## class good and:
## rank feat verbose_score
## 1 1 activity bad
## 2 2 verbdist very bad
## 3 3 sentlen.m very bad
## 4 4 ari very bad
## 5 5 gf very bad
## 6 6 smog bad
## 7 7 NOUNcount.m very bad
## 8 8 VERBfrac.m bad
## 9 9 fkgl very bad
## 10 10 predorder.m bad
## 11 11 mamr bad
## 12 12 passives very bad
## 13 13 predorder.v very bad
## 14 14 literary very bad
## 15 15 predsubjdist.m bad
## even though:
## rank feat verbose_score
## 1 16 compoundVERBs medium
## 16 observation(s) removed from the plot

## 28 / FrBo
## KUK_ID: Fana_00028
## dev: 0.294
## Readability: medium
## class bad and:
## rank feat verbose_score
## 1 1 activity good
## 2 3 sentlen.m good
## 3 4 ari good
## 4 5 gf good
## 5 6 smog good
## 6 7 NOUNcount.m good
## 7 14 literary good
## 8 15 predsubjdist.m good
## 9 16 compoundVERBs very good
## even though:
## rank feat verbose_score
## 1 2 verbdist bad
## 2 8 VERBfrac.m medium
## 3 9 fkgl bad
## 4 10 predorder.m medium
## 5 11 mamr bad
## 6 12 passives medium
## 7 13 predorder.v medium
## 16 observation(s) removed from the plot

## Odvolani_proti_rozhodnuti_o_nepovoleni_kaceni / KUKY
## KUK_ID: 6745acb5c6537d54ff0636ca
## dev: 0.278
## Readability: high
## class good and:
## rank feat verbose_score
## 1 1 activity very bad
## 2 2 verbdist very bad
## 3 3 sentlen.m bad
## 4 4 ari bad
## 5 5 gf very bad
## 6 6 smog very bad
## 7 7 NOUNcount.m very bad
## 8 8 VERBfrac.m very bad
## 9 9 fkgl bad
## 10 10 predorder.m very bad
## 11 11 mamr very bad
## 12 12 passives very bad
## 13 13 predorder.v very bad
## 14 15 predsubjdist.m bad
## 15 16 compoundVERBs bad
## even though:
## rank feat verbose_score
## 1 14 literary good
## 16 observation(s) removed from the plot

## orig_Jak se bránit neposkytnutí projektové dokumentace / FrBo
## KUK_ID: Fart_orig_00259
## dev: 0.275
## Readability: medium
## class bad and:
## rank feat verbose_score
## 1 1 activity good
## 2 2 verbdist good
## 3 3 sentlen.m good
## 4 4 ari good
## 5 7 NOUNcount.m good
## 6 8 VERBfrac.m good
## 7 10 predorder.m good
## 8 11 mamr good
## 9 13 predorder.v very good
## 10 14 literary very good
## 11 16 compoundVERBs very good
## even though:
## rank feat verbose_score
## 1 5 gf medium
## 2 6 smog medium
## 3 9 fkgl medium
## 4 12 passives bad
## 5 15 predsubjdist.m medium
## 16 observation(s) removed from the plot

## orig_Jak se bránit obtěžování kouřem a pálením odpadu / FrBo
## KUK_ID: Fart_orig_05620
## dev: 0.266
## Readability: medium
## class bad and:
## rank feat verbose_score
## 1 2 verbdist good
## 2 10 predorder.m good
## 3 12 passives very good
## 4 13 predorder.v good
## 5 15 predsubjdist.m good
## 6 16 compoundVERBs good
## even though:
## rank feat verbose_score
## 1 1 activity bad
## 2 3 sentlen.m bad
## 3 4 ari bad
## 4 5 gf medium
## 5 6 smog medium
## 6 7 NOUNcount.m bad
## 7 8 VERBfrac.m medium
## 8 9 fkgl bad
## 9 11 mamr bad
## 10 14 literary medium
## 16 observation(s) removed from the plot

## Mestsky_urad_usneseni_-_slouceni_pred / KUKY
## KUK_ID: 66f19554c6537d54ff062453
## dev: 0.237
## Readability: high
## class good and:
## rank feat verbose_score
## 1 1 activity very bad
## 2 2 verbdist very bad
## 3 3 sentlen.m very bad
## 4 4 ari bad
## 5 7 NOUNcount.m very bad
## 6 8 VERBfrac.m very bad
## 7 10 predorder.m very bad
## 8 11 mamr very bad
## 9 13 predorder.v bad
## 10 15 predsubjdist.m bad
## 11 16 compoundVERBs very bad
## even though:
## rank feat verbose_score
## 1 5 gf medium
## 2 6 smog good
## 3 9 fkgl good
## 4 12 passives very good
## 5 14 literary medium
## 16 observation(s) removed from the plot

## 153 / FrBo
## KUK_ID: Fana_00153
## dev: 0.236
## Readability: medium
## class bad and:
## rank feat verbose_score
## 1 3 sentlen.m good
## 2 5 gf good
## 3 6 smog good
## 4 8 VERBfrac.m good
## 5 10 predorder.m good
## 6 12 passives very good
## 7 13 predorder.v very good
## 8 15 predsubjdist.m good
## 9 16 compoundVERBs good
## even though:
## rank feat verbose_score
## 1 1 activity bad
## 2 2 verbdist bad
## 3 4 ari bad
## 4 7 NOUNcount.m bad
## 5 9 fkgl medium
## 6 11 mamr bad
## 7 14 literary bad
## 16 observation(s) removed from the plot

## AK_JH_Podani_US_podpis / KUKY
## KUK_ID: 66f19554c6537d54ff06244f
## dev: 0.225
## Readability: high
## class good and:
## rank feat verbose_score
## 1 1 activity bad
## 2 2 verbdist bad
## 3 3 sentlen.m bad
## 4 4 ari very bad
## 5 5 gf very bad
## 6 6 smog very bad
## 7 8 VERBfrac.m bad
## 8 9 fkgl very bad
## 9 10 predorder.m bad
## 10 12 passives bad
## 11 14 literary bad
## 12 15 predsubjdist.m bad
## 13 16 compoundVERBs bad
## even though:
## rank feat verbose_score
## 1 7 NOUNcount.m good
## 2 11 mamr good
## 3 13 predorder.v medium
## 16 observation(s) removed from the plot

## Mestsky_urad_PRIKAZ_REV2 / KUKY
## KUK_ID: 66f1be84c6537d54ff062492
## dev: 0.189
## Readability: high
## class good and:
## rank feat verbose_score
## 1 1 activity bad
## 2 2 verbdist very bad
## 3 3 sentlen.m very bad
## 4 4 ari bad
## 5 5 gf bad
## 6 6 smog bad
## 7 7 NOUNcount.m bad
## 8 8 VERBfrac.m bad
## 9 9 fkgl bad
## 10 10 predorder.m bad
## 11 11 mamr bad
## 12 12 passives bad
## 13 13 predorder.v very bad
## 14 14 literary bad
## 15 16 compoundVERBs bad
## even though:
## rank feat verbose_score
## 1 15 predsubjdist.m medium
## 16 observation(s) removed from the plot

## 043_Plisen-a-zavady-v-byte / KUKY
## KUK_ID: 673b7a38c6537d54ff062bb3
## dev: 0.183
## Readability: low
## class bad and:
## rank feat verbose_score
## 1 1 activity good
## 2 2 verbdist good
## 3 3 sentlen.m good
## 4 4 ari very good
## 5 5 gf very good
## 6 6 smog very good
## 7 8 VERBfrac.m very good
## 8 9 fkgl very good
## 9 10 predorder.m very good
## 10 11 mamr good
## 11 12 passives very good
## 12 13 predorder.v very good
## 13 15 predsubjdist.m good
## even though:
## rank feat verbose_score
## 1 7 NOUNcount.m medium
## 2 14 literary very bad
## 3 16 compoundVERBs medium
## 16 observation(s) removed from the plot
